library(lubridate)
library(ggplot2)
library(tidyverse)
library(data.table)
library(lubridate)
library(stringr)
library(ggplot2)
library(plotly)
library(corrplot)
features_data_set <- read.csv2("Features data set.csv", header = TRUE, sep = ",")
sales_data_set <- read.csv2("sales data-set.csv", header = TRUE, sep = ",")
stores_data_set <- read.csv2("stores data-set.csv", header = TRUE, sep = ",")
str(features_data_set)
## 'data.frame': 8190 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Factor w/ 182 levels "01/02/2013","01/03/2013",..: 25 67 109 151 26 68 110 152 8 50 ...
## $ Temperature : Factor w/ 4178 levels "-2.06","-6.08",..: 1066 864 934 1308 1300 1998 1797 1601 2315 2579 ...
## $ Fuel_Price : Factor w/ 1011 levels "2.472","2.513",..: 16 10 3 12 43 58 88 95 87 121 ...
## $ MarkDown1 : Factor w/ 4023 levels "-16.93","-2781.45",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : Factor w/ 2715 levels "-0.01","-0.05",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : Factor w/ 2885 levels "-0.2","-0.73",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : Factor w/ 3405 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : Factor w/ 4045 levels "-185.17","-37.02",..: NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : Factor w/ 2505 levels "126.064","126.0766452",..: 1124 1143 1148 1150 1154 1157 1139 1115 1096 1084 ...
## $ Unemployment: Factor w/ 404 levels "10.064","10.115",..: 286 286 286 286 286 286 286 286 252 252 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
str(sales_data_set)
## 'data.frame': 421570 obs. of 5 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Factor w/ 143 levels "01/04/2011","01/06/2012",..: 20 53 86 119 21 54 87 120 6 39 ...
## $ Weekly_Sales: Factor w/ 359464 levels "-0.02","-0.04",..: 140272 237401 220814 101950 120302 114885 122532 148007 272542 225936 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
str(stores_data_set)
## 'data.frame': 45 obs. of 3 variables:
## $ Store: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 2 1 2 1 2 1 2 2 ...
## $ Size : int 151315 202307 37392 205863 34875 202505 70713 155078 125833 126512 ...
features_data_set$Year <- substr(features_data_set$Date, 7, 10)
features_data_set$Month <- substr(features_data_set$Date, 4, 5)
features_data_set$Day <- substr(features_data_set$Date, 1, 2)
sales_data_set$Year <- substr(sales_data_set$Date, 7, 10)
sales_data_set$Month <- substr(sales_data_set$Date, 4, 5)
sales_data_set$Day <- substr(sales_data_set$Date, 1, 2)
sales_data_set$Weekly_Sales <- as.character(sales_data_set$Weekly_Sales)
sales_data_set$Weekly_Sales <- as.numeric(sales_data_set$Weekly_Sales,2)
ggplot(stores_data_set, aes(Type, fill = Type ) ) +
geom_bar() +
xlab("Type of Store") + ylab("Count of Store")
YearSales <- sales_data_set %>% group_by(Year) %>% summarise(YearSales = sum(Weekly_Sales))
ggplot(YearSales, aes(Year, YearSales)) +
geom_col()
SalesStore <- left_join(sales_data_set, stores_data_set, by = "Store")
ggplot(SalesStore, aes(Type, Size) ,log = "xy") +
geom_point()
plot(SalesStore$Size,SalesStore$Weekly_Sales, main = "Size vs Sales", xlab = "Store Size", ylab = "Weekly Sales")
SalesStore <- left_join(sales_data_set, stores_data_set, by = "Store")
monthsales<-SalesStore %>% group_by(Month) %>% summarise(montlysales=sum(Weekly_Sales))
monthsales$montlysales <- as.numeric(monthsales$montlysales)
qplot(x =Month , y = montlysales,data = monthsales)
deptSalesdata <- sales_data_set %>% group_by(Dept) %>% summarise(deptSales = sum(Weekly_Sales)) %>% arrange(desc(deptSales))
deptSalesdata$Dept<-as.factor(deptSalesdata$Dept)
deptSalesdata<-data.frame(deptSalesdata)
ggplot(deptSalesdata,aes(x=Dept,y=deptSales,fill=Dept)) +geom_bar(fill="#56B4E6", stat = "identity") + scale_x_discrete(name="Departments") + theme( axis.text.x = element_text(angle =90)) + ggtitle('Sales of the Departments')
features_data_set$Temperature<-as.numeric(as.vector(features_data_set$Temperature))
features_data_set$Unemployment<-as.numeric(as.vector(features_data_set$Unemployment))
features_data_set$Fuel_Price<-as.numeric(as.vector(features_data_set$Fuel_Price))
sales_data_set$Weekly_Sales<-as.numeric(as.vector(sales_data_set$Weekly_Sales))
features_temp_m <- features_data_set %>% group_by(Month) %>% summarise(ort_temp=mean(Temperature))
sales_m <- sales_data_set %>%group_by(Month) %>% summarise(ort_sa=mean(Weekly_Sales))
temp_sales <- inner_join(sales_m,features_temp_m,by="Month")
ggplot(temp_sales, aes(x = Month, y = ort_temp, size = ort_sa)) +
geom_point(shape = 21,colour = "#000000", fill = "#40b8d0")
features_Unem_m <- features_data_set %>% group_by(Month) %>% summarise(avg_une=mean(Unemployment))
Unem_sales <- inner_join(sales_m,features_Unem_m,by="Month")
ggplot(Unem_sales, aes(x = Month, y = avg_une, size = ort_sa)) +
geom_point(shape = 21,colour = "#000000", fill = "#40b8d0")
Looking at the percentages of the montly sales by the stores
Sales<-data.table(sales_data_set)
Features<-data.table(features_data_set)
Stores<-data.table(stores_data_set)
Sales<-Sales[,list(Store,Dept,Date,Weekly_Sales)]
setkey(Sales,Store,Date)
setkey(Features,Store,Date)
Sales<-Features[Sales]
setkey(Sales,Store)
setkey(Stores,Store)
Sales<-Stores[Sales]
str(Sales)
## Classes 'data.table' and 'data.frame': 421570 obs. of 19 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Date : Factor w/ 143 levels "01/04/2011","01/06/2012",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 59.2 59.2 59.2 59.2 59.2 ...
## $ Fuel_Price : num 3.52 3.52 3.52 3.52 3.52 ...
## $ MarkDown1 : Factor w/ 4023 levels "-16.93","-2781.45",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : Factor w/ 2715 levels "-0.01","-0.05",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : Factor w/ 2885 levels "-0.2","-0.73",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : Factor w/ 3405 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : Factor w/ 4045 levels "-185.17","-37.02",..: NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : Factor w/ 2505 levels "126.064","126.0766452",..: 1524 1524 1524 1524 1524 1524 1524 1524 1524 1524 ...
## $ Unemployment: num 7.68 7.68 7.68 7.68 7.68 ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Year : chr "2011" "2011" "2011" "2011" ...
## $ Month : chr "04" "04" "04" "04" ...
## $ Day : chr "01" "01" "01" "01" ...
## $ Dept : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Weekly_Sales: num 20398 46992 8734 34452 23599 ...
## - attr(*, "sorted")= chr "Store"
## - attr(*, ".internal.selfref")=<externalptr>
summary(Sales)
## Store Type Size Date
## Min. : 1.0 A:215478 Min. : 34875 23/12/2011: 3027
## 1st Qu.:11.0 B:163495 1st Qu.: 93638 25/11/2011: 3021
## Median :22.0 C: 42597 Median :140167 16/12/2011: 3013
## Mean :22.2 Mean :136728 09/12/2011: 3010
## 3rd Qu.:33.0 3rd Qu.:202505 17/02/2012: 3007
## Max. :45.0 Max. :219622 30/12/2011: 3003
## (Other) :403489
## Temperature Fuel_Price MarkDown1 MarkDown2
## Min. : -2.06 Min. :2.472 1.5 : 102 1.91 : 539
## 1st Qu.: 46.68 1st Qu.:2.933 460.73 : 102 3 : 493
## Median : 62.09 Median :3.452 175.64 : 93 0.5 : 485
## Mean : 60.09 Mean :3.361 1282.42: 75 1.5 : 471
## 3rd Qu.: 74.28 3rd Qu.:3.738 1483.17: 75 4 : 367
## Max. :100.14 Max. :4.468 (Other):150234 (Other):108893
## NA's :270889 NA's :310322
## MarkDown3 MarkDown4 MarkDown5 CPI
## 3 : 754 9 : 280 2743.18 : 136 129.8555333: 711
## 6 : 710 4 : 200 1064.56 : 120 131.1083333: 708
## 2 : 660 2 : 197 20371.02: 75 129.8459667: 707
## 1 : 611 3 : 146 3557.67 : 75 130.3849032: 706
## 0.22 : 487 47 : 143 3567.03 : 75 130.6457931: 706
## (Other):133869 (Other):134001 (Other) :150951 130.683 : 706
## NA's :284479 NA's :286603 NA's :270138 (Other) :417326
## Unemployment IsHoliday Year Month
## Min. : 3.879 Mode :logical Length:421570 Length:421570
## 1st Qu.: 6.891 FALSE:391909 Class :character Class :character
## Median : 7.866 TRUE :29661 Mode :character Mode :character
## Mean : 7.960
## 3rd Qu.: 8.572
## Max. :14.313
##
## Day Dept Weekly_Sales
## Length:421570 Min. : 1.00 Min. : -4989
## Class :character 1st Qu.:18.00 1st Qu.: 2080
## Mode :character Median :37.00 Median : 7612
## Mean :44.26 Mean : 15981
## 3rd Qu.:74.00 3rd Qu.: 20206
## Max. :99.00 Max. :693099
##
Sales[,Month:=as.numeric(substring(as.character(Date),4,5))]
Sales$Date<-dmy(Sales$Date)
MonthlySales<-Sales[,sum(Weekly_Sales,na.rm = TRUE),.(Store,Month)]
setnames(MonthlySales,"V1","Monthly_Sales")
MonthlySales[,TotalSales:=sum(Monthly_Sales,na.rm = TRUE),.(Month)]
MonthlySales[,SalesPercantage:=Monthly_Sales*1.0/TotalSales]
Clusno<-5
CM=dcast.data.table(MonthlySales,Store~Month,value.var="SalesPercantage")
S<-colnames(CM)
CM<-data.frame(CM)
CM[is.na(CM)]=0
colnames(CM)<-S
CM<-data.table(CM)
# basl<-which(colnames(rr)=="2")
# bitis<-which(colnames(rr)=="269")
set.seed(7)
CM[,clusno:=kmeans(CM[,c(2:ncol(CM)),with=F],Clusno)$cluster]
clusters<-CM[,list(Store,clusno)]
setkey(clusters,Store)
setkey(MonthlySales,Store)
MonthlySales<-clusters[MonthlySales]
SalesP<-dcast.data.table(MonthlySales,Month~Store,value.var="SalesPercantage")
MonthlySales$Month <- factor(MonthlySales$Month)
MonthlySales$Store <- factor(MonthlySales$Store)
MonthlySales$clusno <- factor(MonthlySales$clusno)
# plotting reference lines across each facet:
referenceLines <- MonthlySales # \/ Rename
colnames(referenceLines)[2] <- "groupVar"
zp <- ggplot(MonthlySales,
aes(x = Month, y = SalesPercantage))
zp <- zp + geom_line(data = referenceLines, # Plotting the "underlayer"
aes(x = Month, y = SalesPercantage, group = groupVar),
colour = "GRAY", alpha = 1/2, size = 1/2)
zp <- zp + geom_line(size = 1) # Drawing the "overlayer"
zp <- zp + facet_wrap(~ Store)
zp <- zp + theme_bw()
ggplotly()
ggplot(MonthlySales, aes(x=Month, y=SalesPercantage, color=clusno, group=Store)) +
geom_line()
ggplotly()
YearlySales<-Sales[,sum(Weekly_Sales,na.rm = TRUE),.(Store,Type,Size)]
setnames(YearlySales,"V1","Yearly_Sales")
ggplot(YearlySales,aes(x=Size,y=Yearly_Sales)) +
geom_point()+
geom_smooth(method=lm,color="RED",se = FALSE)+
scale_x_continuous(waiver()) + scale_y_continuous(waiver())
daysales<-Sales %>%
group_by(Day) %>%
summarise(Salesofthedays=sum(Weekly_Sales))
daysales$Day<-as.factor(daysales$Day)
ggplot(daysales,aes(x=Day,y=Salesofthedays,fill=Day)) +geom_bar(fill="#FF6666", stat = "identity") + scale_x_discrete(name="Days") + theme( axis.text.x = element_text(angle =90)) + ggtitle('Sales of the Days')
alldata <- inner_join(SalesStore, features_data_set , by = c("Store", "Year", "Month", "Day"))
selectcol <- alldata %>% select( Size, CPI ,Unemployment ,Fuel_Price, Weekly_Sales, Temperature)
selectcol$CPI <- as.numeric(as.character(selectcol$CPI))
selectcol$Unemployment <- as.numeric(as.character(selectcol$Unemployment))
selectcol$Fuel_Price <- as.numeric(as.character(selectcol$Fuel_Price))
selectcol$Weekly_Sales <- as.numeric(as.character(selectcol$Weekly_Sales))
selectcol$Temperature <- as.numeric(as.character(selectcol$Temperature))
matrixdata <- as.matrix(selectcol)
corrplot(cor(selectcol) ,method = "circle")
###References #_Retail Data Analytics. (2017, August). Retrieved from https://www.kaggle.com/manjeetsingh/retaildataset_